import numpy as np
import pandas as pd
#Data Reader from Internet
#pip install pandas-datareader
import matplotlib.pyplot as plt
import seaborn as sns
#visualization inside Jupyter Notebook
%matplotlib inline
#display image in Jupyter Notebook
from IPython.display import Image
###################
# Interactive plots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# get version
from plotly import __version__
#print(__version__)
import cufflinks as cf
# For Notebooks
init_notebook_mode(connected=True)
# For offline use
cf.go_offline()
#[[Plotly "after May 2020"
# + pip install chart-studio
#import chart-studio.plotly as py
###################
# Machine Learning (pip install scikit-learn)
#from sklearn.la_famille_de_modeles import le_modele
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
##try a bunch of combinations and see what works best!
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix
#from sklearn.metrics import confusion_matrix
#from sklearn.datasets import load_boston #deprecated
# BUT other available (load_breast_cancer, load....)
###################
# NLP - Natural Language Processing (pip install nltk)
import nltk # Natural Language Toolkit
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
###################
# PIPELINE
from sklearn.pipeline import Pipeline
###################
# Artificial Neural Networks (ANN) (pip install tensorflow)
# MatPlotLib rearrange display
fig.tight_layout()
# or
plt.tight_layout()
# SeaBorn Load dataset
tips = sns.load_dataset('tips')
# import data using separator
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
df_movies = pd.read_csv('datas/u.data', sep='\t', names=column_names)
df_movies.head()
| user_id | item_id | rating | timestamp | |
|---|---|---|---|---|
| 0 | 0 | 50 | 5 | 881250949 |
| 1 | 0 | 172 | 5 | 881250949 |
| 2 | 0 | 133 | 1 | 881250949 |
| 3 | 196 | 242 | 3 | 881250949 |
| 4 | 186 | 302 | 3 | 891717742 |
movie_titles.head()
| item_id | title | |
|---|---|---|
| 0 | 1 | Toy Story (1995) |
| 1 | 2 | GoldenEye (1995) |
| 2 | 3 | Four Rooms (1995) |
| 3 | 4 | Get Shorty (1995) |
| 4 | 5 | Copycat (1995) |
# merge datasets
movie_titles = pd.read_csv("datas/Movie_Id_Titles")
movie_titles.head()
# Join on the shared 'item_id' key: adds the movie title to every rating row.
df_movies = pd.merge(df_movies,movie_titles,on='item_id')
# if merging on index columns ==> can use join instead
# df_movies = df_movies.join(movie_titles['title'])
df_movies.head()
| user_id | item_id | rating | timestamp | title | |
|---|---|---|---|---|---|
| 0 | 0 | 50 | 5 | 881250949 | Star Wars (1977) |
| 1 | 290 | 50 | 5 | 880473582 | Star Wars (1977) |
| 2 | 79 | 50 | 4 | 891271545 | Star Wars (1977) |
| 3 | 2 | 50 | 5 | 888552084 | Star Wars (1977) |
| 4 | 8 | 50 | 5 | 879362124 | Star Wars (1977) |
# mean group by rating
df_movies.groupby('title')['rating'].mean().sort_values(ascending=False).head()
title They Made Me a Criminal (1939) 5.0 Marlene Dietrich: Shadow and Light (1996) 5.0 Saint of Fort Washington, The (1993) 5.0 Someone Else's America (1995) 5.0 Star Kid (1997) 5.0 Name: rating, dtype: float64
df_movies.groupby('title')['rating'].count().sort_values(ascending=False).head(4)
title Star Wars (1977) 584 Contact (1997) 509 Fargo (1996) 508 Return of the Jedi (1983) 507 Name: rating, dtype: int64
df_movies[df_movies['title']=='Star Wars (1977)'].groupby(
['title','rating'])['item_id'].count()
title rating
Star Wars (1977) 1 9
2 16
3 57
4 176
5 326
Name: item_id, dtype: int64
# top 10 movies with 5 stars
df_movies[df_movies['rating']==5].groupby(
['title','rating'])['item_id'].count().sort_values(ascending=False).head(10)
title rating Star Wars (1977) 5 326 Fargo (1996) 5 227 Godfather, The (1972) 5 214 Raiders of the Lost Ark (1981) 5 202 Pulp Fiction (1994) 5 188 Schindler's List (1993) 5 186 Silence of the Lambs, The (1991) 5 181 Titanic (1997) 5 179 Empire Strikes Back, The (1980) 5 173 Return of the Jedi (1983) 5 171 Name: item_id, dtype: int64
ratings = pd.DataFrame(df_movies.groupby('title')['rating'].mean())
ratings['num of ratings'] = pd.DataFrame(df_movies.groupby('title')['rating'].count())
ratings.sort_values(by='num of ratings', ascending=False).head(10)
| rating | num of ratings | |
|---|---|---|
| title | ||
| Star Wars (1977) | 4.359589 | 584 |
| Contact (1997) | 3.803536 | 509 |
| Fargo (1996) | 4.155512 | 508 |
| Return of the Jedi (1983) | 4.007890 | 507 |
| Liar Liar (1997) | 3.156701 | 485 |
| English Patient, The (1996) | 3.656965 | 481 |
| Scream (1996) | 3.441423 | 478 |
| Toy Story (1995) | 3.878319 | 452 |
| Air Force One (1997) | 3.631090 | 431 |
| Independence Day (ID4) (1996) | 3.438228 | 429 |
# simple group by: mean of all numeric columns per star rating
yelp = pd.read_csv('datas/yelp.csv')
# add a feature: length (in characters) of each review text
yelp['text length'] = yelp['text'].apply(len)
# numeric_only=True skips the text columns so .mean() does not raise
stars = yelp.groupby(by='stars').mean(numeric_only=True)
stars
| cool | useful | funny | text length | |
|---|---|---|---|---|
| stars | ||||
| 1 | 0.576769 | 1.604806 | 1.056075 | 826.515354 |
| 2 | 0.719525 | 1.563107 | 0.875944 | 842.256742 |
| 3 | 0.788501 | 1.306639 | 0.694730 | 758.498289 |
| 4 | 0.954623 | 1.395916 | 0.670448 | 712.923142 |
| 5 | 0.944261 | 1.381780 | 0.608631 | 624.999101 |
train = pd.read_csv('datas/titanic_train.csv') #index_col=0)
train # 891 x 12
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | S |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | S |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 24.0 | 1 | 2 | W./C. 6607 | 23.4500 | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | Q |
891 rows × 11 columns
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 891 non-null int64 1 Survived 891 non-null int64 2 Pclass 891 non-null int64 3 Name 891 non-null object 4 Sex 891 non-null object 5 Age 714 non-null float64 6 SibSp 891 non-null int64 7 Parch 891 non-null int64 8 Ticket 891 non-null object 9 Fare 891 non-null float64 10 Cabin 204 non-null object 11 Embarked 889 non-null object dtypes: float64(2), int64(5), object(5) memory usage: 83.7+ KB
train.describe()
| PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
|---|---|---|---|---|---|---|---|
| count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
| std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
| min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
| max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
train.groupby('Survived').describe()
| PassengerId | Pclass | ... | Parch | Fare | |||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | count | mean | ... | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
| Survived | |||||||||||||||||||||
| 0 | 549.0 | 447.016393 | 260.640469 | 1.0 | 211.00 | 455.0 | 675.0 | 891.0 | 549.0 | 2.531876 | ... | 0.0 | 6.0 | 549.0 | 22.117887 | 31.388207 | 0.0 | 7.8542 | 10.5 | 26.0 | 263.0000 |
| 1 | 342.0 | 444.368421 | 252.358840 | 2.0 | 250.75 | 439.5 | 651.5 | 890.0 | 342.0 | 1.950292 | ... | 1.0 | 5.0 | 342.0 | 48.395408 | 66.596998 | 0.0 | 12.4750 | 26.0 | 57.0 | 512.3292 |
2 rows × 48 columns
##################
# show null datas
##################
# Each yellow cell marks a missing value; columns with dense bands (Age, Cabin)
# need imputation or dropping before modelling.
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap='viridis')
<AxesSubplot: >
####################################
# drop a column
####################################
# Cabin is mostly null (204/891 non-null per train.info()), so drop it entirely.
train.drop('Cabin',axis=1 #axis=0 for rows, axis=1 for columns
           ,inplace=True) #inplace to apply on data train (not a copy)
train # 891 x 11
####################################
# fill missing values (here Age replaced by mean age in Pclass)
####################################
def impute_age(cols):
    """Return the passenger's Age, imputing missing values per Pclass.

    Parameters
    ----------
    cols : pandas.Series
        One row of ``train[['Age','Pclass']]`` as produced by
        ``.apply(impute_age, axis=1)`` — position 0 is Age, position 1 is Pclass.

    Returns
    -------
    float
        The original Age when present; otherwise the (rounded) mean age of the
        passenger class observed in the training data: 37 for 1st class,
        29 for 2nd, 24 for 3rd.
    """
    # Positional access via cols[0] on a string-labelled Series is deprecated
    # (pandas >= 2.1) -- use .iloc for explicit positional indexing.
    age = cols.iloc[0]
    pclass = cols.iloc[1]
    if pd.isnull(age):
        if pclass == 1:
            return 37
        elif pclass == 2:
            return 29
        return 24
    return age
# Row-wise apply: each row arrives in impute_age as a Series labelled ['Age','Pclass'].
train['Age'] = train[['Age','Pclass']].apply(impute_age,axis=1)
train
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | S |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | S |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 24.0 | 1 | 2 | W./C. 6607 | 23.4500 | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | Q |
891 rows × 11 columns
####################################
# drop null values (DROPS THE ROW containing null values)
####################################
# After Age imputation only Embarked still holds nulls (2 rows), hence 891 -> 889.
train.dropna(inplace=True)
train # 889 x 11
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | S |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | S |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 24.0 | 1 | 2 | W./C. 6607 | 23.4500 | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | Q |
889 rows × 11 columns
# NOTE(review): the 'male' dummy column is only created further below
# (pd.get_dummies on 'Sex'); this cell appears out of order in the export --
# running top-to-bottom it would raise KeyError. Verify cell order.
train.drop('male',axis=1 #axis=0 for rows, axis=1 for columns
           ,inplace=True)
train
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | S |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | S |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 24.0 | 1 | 2 | W./C. 6607 | 23.4500 | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | Q |
889 rows × 11 columns
####################################
# replace non numeric values (short)
####################################
loans = pd.read_csv('datas/loan_data.csv')
#cat_feats = ['purpose']
# One-hot encode the categorical 'purpose' column (7 categories -> 7 columns).
# drop_first=True would remove one redundant column to avoid collinearity.
final_data = pd.get_dummies(loans,columns=['purpose']) # ,drop_first=True
loans['purpose'].unique()
array(['debt_consolidation', 'credit_card', 'all_other',
'home_improvement', 'small_business', 'major_purchase',
'educational'], dtype=object)
final_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9578 entries, 0 to 9577 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 credit.policy 9578 non-null int64 1 int.rate 9578 non-null float64 2 installment 9578 non-null float64 3 log.annual.inc 9578 non-null float64 4 dti 9578 non-null float64 5 fico 9578 non-null int64 6 days.with.cr.line 9578 non-null float64 7 revol.bal 9578 non-null int64 8 revol.util 9578 non-null float64 9 inq.last.6mths 9578 non-null int64 10 delinq.2yrs 9578 non-null int64 11 pub.rec 9578 non-null int64 12 not.fully.paid 9578 non-null int64 13 purpose_all_other 9578 non-null uint8 14 purpose_credit_card 9578 non-null uint8 15 purpose_debt_consolidation 9578 non-null uint8 16 purpose_educational 9578 non-null uint8 17 purpose_home_improvement 9578 non-null uint8 18 purpose_major_purchase 9578 non-null uint8 19 purpose_small_business 9578 non-null uint8 dtypes: float64(6), int64(7), uint8(7) memory usage: 1.0 MB
loans.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9578 entries, 0 to 9577 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 credit.policy 9578 non-null int64 1 purpose 9578 non-null object 2 int.rate 9578 non-null float64 3 installment 9578 non-null float64 4 log.annual.inc 9578 non-null float64 5 dti 9578 non-null float64 6 fico 9578 non-null int64 7 days.with.cr.line 9578 non-null float64 8 revol.bal 9578 non-null int64 9 revol.util 9578 non-null float64 10 inq.last.6mths 9578 non-null int64 11 delinq.2yrs 9578 non-null int64 12 pub.rec 9578 non-null int64 13 not.fully.paid 9578 non-null int64 dtypes: float64(6), int64(7), object(1) memory usage: 1.0+ MB
####################################
# replace non numeric values
####################################
pd.get_dummies(train['Sex']) # returns tab with columns = values and rows = true/false (1/0)
# for Sex it's male or female (binary), so the first column is redundant and can be dropped
pd.get_dummies(train['Sex'],drop_first=True)
# save in tab
sex = pd.get_dummies(train['Sex'],drop_first=True)
# add it to our data (column-wise concat keeps the row index aligned)
train = pd.concat([train,sex],axis=1)
train
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S | 1 |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C | 0 |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S | 0 |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S | 0 |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | S | 1 |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | S | 0 |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 24.0 | 1 | 2 | W./C. 6607 | 23.4500 | S | 0 |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C | 1 |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | Q | 1 |
889 rows × 12 columns
train['Embarked'].unique() #unique values in a column
array(['S', 'C', 'Q'], dtype=object)
pd.get_dummies(train['Embarked'],drop_first=True)
| Q | S | |
|---|---|---|
| 0 | 0 | 1 |
| 1 | 0 | 0 |
| 2 | 0 | 1 |
| 3 | 0 | 1 |
| 4 | 0 | 1 |
| ... | ... | ... |
| 886 | 0 | 1 |
| 887 | 0 | 1 |
| 888 | 0 | 1 |
| 889 | 0 | 0 |
| 890 | 1 | 0 |
889 rows × 2 columns
# save in tab ('C' dropped as the redundant base category; keeps Q and S)
embarked = pd.get_dummies(train['Embarked'],drop_first=True)
# add it to our data
train = pd.concat([train,embarked],axis=1)
# the original categorical column is now encoded, so remove it
train.drop('Embarked',axis=1,inplace=True)
train
| Survived | Pclass | Age | SibSp | Parch | Fare | male | Q | S | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 22.0 | 1 | 0 | 7.2500 | 1 | 0 | 1 |
| 1 | 1 | 1 | 38.0 | 1 | 0 | 71.2833 | 0 | 0 | 0 |
| 2 | 1 | 3 | 26.0 | 0 | 0 | 7.9250 | 0 | 0 | 1 |
| 3 | 1 | 1 | 35.0 | 1 | 0 | 53.1000 | 0 | 0 | 1 |
| 4 | 0 | 3 | 35.0 | 0 | 0 | 8.0500 | 1 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 0 | 2 | 27.0 | 0 | 0 | 13.0000 | 1 | 0 | 1 |
| 887 | 1 | 1 | 19.0 | 0 | 0 | 30.0000 | 0 | 0 | 1 |
| 888 | 0 | 3 | 24.0 | 1 | 2 | 23.4500 | 0 | 0 | 1 |
| 889 | 1 | 1 | 26.0 | 0 | 0 | 30.0000 | 1 | 0 | 0 |
| 890 | 0 | 3 | 32.0 | 0 | 0 | 7.7500 | 1 | 1 | 0 |
889 rows × 9 columns
train.drop('Q',axis=1,inplace=True)
train
| Survived | Pclass | Age | SibSp | Parch | Fare | Embarked | male | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 22.0 | 1 | 0 | 7.2500 | S | 1 |
| 1 | 1 | 1 | 38.0 | 1 | 0 | 71.2833 | C | 0 |
| 2 | 1 | 3 | 26.0 | 0 | 0 | 7.9250 | S | 0 |
| 3 | 1 | 1 | 35.0 | 1 | 0 | 53.1000 | S | 0 |
| 4 | 0 | 3 | 35.0 | 0 | 0 | 8.0500 | S | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 0 | 2 | 27.0 | 0 | 0 | 13.0000 | S | 1 |
| 887 | 1 | 1 | 19.0 | 0 | 0 | 30.0000 | S | 0 |
| 888 | 0 | 3 | 24.0 | 1 | 2 | 23.4500 | S | 0 |
| 889 | 1 | 1 | 26.0 | 0 | 0 | 30.0000 | C | 1 |
| 890 | 0 | 3 | 32.0 | 0 | 0 | 7.7500 | Q | 1 |
889 rows × 8 columns
####################################
# dates - timestamps treatment
####################################
# Convert the string column to real datetimes so date components are available.
df['timeStamp'] = pd.to_datetime(df['timeStamp'])
# The .dt accessor extracts components vectorised -- same result as
# .apply(lambda t: t.hour) etc., but runs in C instead of a Python loop.
df['Hour'] = df['timeStamp'].dt.hour
df['Month'] = df['timeStamp'].dt.month
df['Day of Week'] = df['timeStamp'].dt.dayofweek
# Map the weekday number (Monday=0 .. Sunday=6) to a readable abbreviation.
dmap = {0:'Mon',1:'Tue',2:'Wed',3:'Thu',4:'Fri',5:'Sat',6:'Sun'}
df['Day of Week'] = df['Day of Week'].map(dmap)
# fast cross checking datas
sns.pairplot(data=df,hue='COL_NAME',palette='bwr') # dummy parameters: replace COL_NAME with a real column
####################################
# incrusted plot (inset axes)
####################################
fig = plt.figure(figsize=(12,8))
# add_axes takes [left, bottom, width, height] in figure-fraction coordinates
ax1 = fig.add_axes([0,0,1,1])
ax2 = fig.add_axes([.3,.3,.6,.5])
ax1.hist(train['Fare'],bins=30) # histogram of all data
ax2.hist(train[(train['Fare']<60)]['Fare'],bins=50) # incrusted zoom on Fares < 60 more populated
(array([ 15., 0., 0., 1., 1., 25., 247., 40., 35., 9., 55.,
10., 28., 28., 5., 8., 8., 19., 5., 8., 10., 39.,
28., 16., 8., 18., 14., 5., 6., 6., 2., 2., 5.,
10., 3., 1., 0., 0., 0., 7., 0., 3., 2., 10.,
8., 0., 3., 13., 2., 1.]),
array([ 0. , 1.188, 2.376, 3.564, 4.752, 5.94 , 7.128, 8.316,
9.504, 10.692, 11.88 , 13.068, 14.256, 15.444, 16.632, 17.82 ,
19.008, 20.196, 21.384, 22.572, 23.76 , 24.948, 26.136, 27.324,
28.512, 29.7 , 30.888, 32.076, 33.264, 34.452, 35.64 , 36.828,
38.016, 39.204, 40.392, 41.58 , 42.768, 43.956, 45.144, 46.332,
47.52 , 48.708, 49.896, 51.084, 52.272, 53.46 , 54.648, 55.836,
57.024, 58.212, 59.4 ]),
<BarContainer object of 50 artists>)
loans = pd.read_csv('datas/loan_data.csv')
####################################
# 2 histograms on same plot
####################################
b=25   # number of bins, shared by both histograms
a=0.5  # alpha (transparency) so the overlap stays readable
# variable column: which binary label to split the FICO distribution by
col='not.fully.paid'
#col='credit.policy'
plt.figure(figsize=(12,6))
loans[loans[col]==1]['fico'].hist(label=col+'=1',bins=b,alpha=a,color='red')
loans[loans[col]==0]['fico'].hist(label=col+'=0',bins=b,alpha=a,color='blue')
plt.legend()
plt.xlabel('FICO')
Text(0.5, 0, 'FICO')
College_Data = pd.read_csv('datas/College_Data')
####################################
# overlaid histograms coloured by a category (FacetGrid + hue)
####################################
sns.set_style('darkgrid')
g = sns.FacetGrid(data=College_Data, hue="Private",
                  palette='coolwarm', height=6,
                  #size=6, #deprecated (renamed to height in seaborn >= 0.9)
                  aspect=2)
g = g.map(plt.hist,'Outstate',bins=20,alpha=0.7).add_legend()
messages = pd.read_csv('datas/SMSSpamCollection',
                       sep='\t',
                       names=["label", "message"])
####################################
# 2 histograms on different plots (one subplot per label value)
####################################
messages['length'] = messages['message'].apply(len)
messages.hist(column='length', by='label', bins=50,figsize=(12,4))
array([<AxesSubplot: title={'center': 'ham'}>,
<AxesSubplot: title={'center': 'spam'}>], dtype=object)
# grid of histograms: one column of the grid per distinct 'stars' value
yelp = pd.read_csv('datas/yelp.csv')
yelp['text length'] = yelp['text'].apply(len)
g = sns.FacetGrid(data=yelp, col='stars')
g.map(plt.hist, 'text length')
<seaborn.axisgrid.FacetGrid at 0x15b0f3ff580>
loans = pd.read_csv('datas/loan_data.csv')
sns.lmplot(data=loans,x='fico',y='int.rate',
hue='credit.policy',col='not.fully.paid')
<seaborn.axisgrid.FacetGrid at 0x2a0603c97b0>
ad_data = pd.read_csv('datas/advertising.csv')
sns.pairplot(data=ad_data,hue='Clicked on Ad',palette='bwr')
<seaborn.axisgrid.PairGrid at 0x2a061023c10>
####################################
# interactive version (cufflinks adds .iplot to DataFrames/Series)
####################################
import cufflinks as cf
cf.go_offline()
train[train['Fare']<60]['Fare'].iplot(kind='hist',bins=50,color='green')
# static screenshot of the interactive output, for the exported notebook
Image(filename='imgs/13-Logistic-Regression--01-Logistic Regression with Python--iplot-hist.png')
####################################
# Multiple interactive plots
####################################
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import __version__
print(__version__)
5.11.0
#hist1 = go.Histogram(x=train['Fare'],name='All Fares',)
# NOTE(review): go.Bar() with no data is an empty placeholder -- presumably the
# commented-out Histogram above was intended for the left subplot; verify.
hist1 = go.Bar()
hist2 = go.Histogram(x=train[train['Fare']<60]['Fare'],name='Fare < 60')
fig1 = go.Figure(data=hist1)
fig2 = go.Figure(data=hist2)
# place the two figures side by side in a 1x2 grid
figs = cf.subplots([fig1, fig2],shape=(1,2))
iplot(figs)
# http://localhost:8888/notebooks/Refactored_Py_DS_ML_Bootcamp-master/13-Logistic-Regression/02-Logistic%20Regression%20Project.ipynb
#
ad_data = pd.read_csv('datas/advertising.csv')
sns.jointplot(data=ad_data,x='Age',y='Daily Time Spent on Site',
kind='kde',color='red',fill=True,
marginal_kws=dict(alpha=0.1))
<seaborn.axisgrid.JointGrid at 0x1bcf8378400>
Image(filename='imgs/13-Logistic-Regression--02-Logistic Regression Project--JoinPlot.png')
train.drop(['PassengerId','Name','Sex','Ticket'],axis=1,inplace=True)
pd.get_dummies(train['Embarked'],drop_first=True)
| Q | S | |
|---|---|---|
| 0 | 0 | 1 |
| 1 | 0 | 0 |
| 2 | 0 | 1 |
| 3 | 0 | 1 |
| 4 | 0 | 1 |
| ... | ... | ... |
| 886 | 0 | 1 |
| 887 | 0 | 1 |
| 888 | 0 | 1 |
| 889 | 0 | 0 |
| 890 | 1 | 0 |
889 rows × 2 columns
######## CONVERT LINEAR Regression to LOGISTIC Regression ########
Image('imgs/linear-to-logistic-1.JPG')
Image('imgs/linear-to-logistic-2.JPG')
######## LogisticRegression ########
#####################################
# Prepare trainings and tests data
#####################################
from sklearn.model_selection import train_test_split
# target = survival flag; features = everything else
y = train['Survived']
X = train.drop('Survived',axis=1)
# 70/30 split; fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size=0.30,
                                                    random_state=101)
#####################################
# Train model
#####################################
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression() # create instance of Logistic model
logmodel.fit(X=X_train,y=y_train)
#####################################
# Run predictions
#####################################
predictions = logmodel.predict(X_test)
######## K Nearest Neighbors (KNN) ########
df = pd.read_csv('datas/KNN_Project_Data')
#####################################
# Prepare trainings and tests data
#####################################
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
scaler = StandardScaler()
# fill scaler values (learn per-column mean and std from the features)
scaler.fit(df.drop('TARGET CLASS',axis=1))
#scale features to standardize everything to the same scale
#neither values with large scale will impact differently the values with small scale
scaled_features = scaler.transform(df.drop('TARGET CLASS',axis=1))
#==> returns values centred around 0 (zero mean, unit variance) -- essential for
# KNN since it is distance-based
df_feat = pd.DataFrame(scaled_features,columns=df.columns[:-1])
X=scaled_features
y=df['TARGET CLASS']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=101)
#####################################
# Train model
#####################################
from sklearn.neighbors import KNeighborsClassifier
# K=1: each point classified by its single nearest neighbour
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X=X_train, y=y_train)
#####################################
# Run predictions
#####################################
predictions = knn.predict(X_test)
predictions
#####################################
# Find Best K value (elbow method)
#####################################
error_rate = []  # mean misclassification rate on the test set, for each K
iMaxLoop = 60
for i in range(1,iMaxLoop) :
    # Refit the classifier for every candidate K and score it on the test set.
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i!=y_test))
    print("**************************")
    print("**classification_report** WITH K=",str(i))
    print(classification_report(y_test,pred_i))
    print("**confusion_matrix** WITH K=",str(i))
    conf_matrix = confusion_matrix(y_test,pred_i)
    print(conf_matrix)
    # sklearn convention: rows = actual class, columns = predicted class.
    # So [0,1] = actual 0 predicted 1 (FALSE POSITIVE) and [1,0] = actual 1
    # predicted 0 (FALSE NEGATIVE) -- the original names were swapped, though
    # the printed total (their sum) was unaffected.
    FalPos = conf_matrix[0,1]
    FalNeg = conf_matrix[1,0]
    print("Errors :",str(FalNeg+FalPos))
# Plot error rate vs K; pick K at the elbow / lowest stable error.
plt.figure(figsize=(10,6))
plt.plot(range(1,iMaxLoop),error_rate,
         color='blue',linestyle='dashed', #linestyle='--',
         marker='o',markerfacecolor='red',markersize=10)
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.title('Error rate vs K value')
**************************
**classification_report** WITH K= 1
precision recall f1-score support
0 0.73 0.72 0.72 152
1 0.71 0.72 0.72 148
accuracy 0.72 300
macro avg 0.72 0.72 0.72 300
weighted avg 0.72 0.72 0.72 300
**confusion_matrix** WITH K= 1
[[109 43]
[ 41 107]]
Errors : 84
**************************
**classification_report** WITH K= 2
precision recall f1-score support
0 0.67 0.85 0.75 152
1 0.79 0.57 0.66 148
accuracy 0.71 300
macro avg 0.73 0.71 0.70 300
weighted avg 0.73 0.71 0.70 300
**confusion_matrix** WITH K= 2
[[129 23]
[ 64 84]]
Errors : 87
**************************
**classification_report** WITH K= 3
precision recall f1-score support
0 0.80 0.77 0.78 152
1 0.77 0.80 0.78 148
accuracy 0.78 300
macro avg 0.78 0.78 0.78 300
weighted avg 0.78 0.78 0.78 300
**confusion_matrix** WITH K= 3
[[117 35]
[ 30 118]]
Errors : 65
**************************
**classification_report** WITH K= 4
precision recall f1-score support
0 0.75 0.86 0.80 152
1 0.83 0.70 0.76 148
accuracy 0.78 300
macro avg 0.79 0.78 0.78 300
weighted avg 0.79 0.78 0.78 300
**confusion_matrix** WITH K= 4
[[130 22]
[ 44 104]]
Errors : 66
**************************
**classification_report** WITH K= 5
precision recall f1-score support
0 0.79 0.80 0.80 152
1 0.79 0.78 0.79 148
accuracy 0.79 300
macro avg 0.79 0.79 0.79 300
weighted avg 0.79 0.79 0.79 300
**confusion_matrix** WITH K= 5
[[122 30]
[ 32 116]]
Errors : 62
**************************
**classification_report** WITH K= 6
precision recall f1-score support
0 0.76 0.86 0.80 152
1 0.83 0.72 0.77 148
accuracy 0.79 300
macro avg 0.79 0.79 0.79 300
weighted avg 0.79 0.79 0.79 300
**confusion_matrix** WITH K= 6
[[130 22]
[ 41 107]]
Errors : 63
**************************
**classification_report** WITH K= 7
precision recall f1-score support
0 0.83 0.81 0.82 152
1 0.81 0.82 0.82 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 7
[[123 29]
[ 26 122]]
Errors : 55
**************************
**classification_report** WITH K= 8
precision recall f1-score support
0 0.80 0.84 0.82 152
1 0.83 0.78 0.80 148
accuracy 0.81 300
macro avg 0.81 0.81 0.81 300
weighted avg 0.81 0.81 0.81 300
**confusion_matrix** WITH K= 8
[[128 24]
[ 33 115]]
Errors : 57
**************************
**classification_report** WITH K= 9
precision recall f1-score support
0 0.81 0.81 0.81 152
1 0.81 0.81 0.81 148
accuracy 0.81 300
macro avg 0.81 0.81 0.81 300
weighted avg 0.81 0.81 0.81 300
**confusion_matrix** WITH K= 9
[[123 29]
[ 28 120]]
Errors : 57
**************************
**classification_report** WITH K= 10
precision recall f1-score support
0 0.81 0.86 0.83 152
1 0.84 0.79 0.82 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 10
[[130 22]
[ 31 117]]
Errors : 53
**************************
**classification_report** WITH K= 11
precision recall f1-score support
0 0.83 0.81 0.82 152
1 0.81 0.82 0.82 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 11
[[123 29]
[ 26 122]]
Errors : 55
**************************
**classification_report** WITH K= 12
precision recall f1-score support
0 0.81 0.84 0.82 152
1 0.83 0.79 0.81 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 12
[[128 24]
[ 31 117]]
Errors : 55
**************************
**classification_report** WITH K= 13
precision recall f1-score support
0 0.83 0.81 0.82 152
1 0.81 0.82 0.82 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 13
[[123 29]
[ 26 122]]
Errors : 55
**************************
**classification_report** WITH K= 14
precision recall f1-score support
0 0.81 0.84 0.82 152
1 0.83 0.80 0.82 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 14
[[127 25]
[ 29 119]]
Errors : 54
**************************
**classification_report** WITH K= 15
precision recall f1-score support
0 0.83 0.81 0.82 152
1 0.81 0.83 0.82 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 15
[[123 29]
[ 25 123]]
Errors : 54
**************************
**classification_report** WITH K= 16
precision recall f1-score support
0 0.81 0.84 0.82 152
1 0.83 0.80 0.82 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 16
[[127 25]
[ 29 119]]
Errors : 54
**************************
**classification_report** WITH K= 17
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 17
[[124 28]
[ 23 125]]
Errors : 51
**************************
**classification_report** WITH K= 18
precision recall f1-score support
0 0.82 0.84 0.83 152
1 0.83 0.82 0.82 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 18
[[127 25]
[ 27 121]]
Errors : 52
**************************
**classification_report** WITH K= 19
precision recall f1-score support
0 0.83 0.82 0.83 152
1 0.82 0.82 0.82 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 19
[[125 27]
[ 26 122]]
Errors : 53
**************************
**classification_report** WITH K= 20
precision recall f1-score support
0 0.82 0.82 0.82 152
1 0.82 0.81 0.81 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 20
[[125 27]
[ 28 120]]
Errors : 55
**************************
**classification_report** WITH K= 21
precision recall f1-score support
0 0.84 0.81 0.82 152
1 0.81 0.84 0.82 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 21
[[123 29]
[ 24 124]]
Errors : 53
**************************
**classification_report** WITH K= 22
precision recall f1-score support
0 0.82 0.82 0.82 152
1 0.81 0.82 0.81 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 22
[[124 28]
[ 27 121]]
Errors : 55
**************************
**classification_report** WITH K= 23
precision recall f1-score support
0 0.85 0.82 0.83 152
1 0.82 0.85 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 23
[[124 28]
[ 22 126]]
Errors : 50
**************************
**classification_report** WITH K= 24
precision recall f1-score support
0 0.82 0.82 0.82 152
1 0.82 0.82 0.82 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 24
[[125 27]
[ 27 121]]
Errors : 54
**************************
**classification_report** WITH K= 25
precision recall f1-score support
0 0.85 0.82 0.83 152
1 0.82 0.85 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 25
[[124 28]
[ 22 126]]
Errors : 50
**************************
**classification_report** WITH K= 26
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 26
[[125 27]
[ 24 124]]
Errors : 51
**************************
**classification_report** WITH K= 27
precision recall f1-score support
0 0.85 0.82 0.83 152
1 0.82 0.85 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 27
[[124 28]
[ 22 126]]
Errors : 50
**************************
**classification_report** WITH K= 28
precision recall f1-score support
0 0.83 0.82 0.83 152
1 0.82 0.83 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 28
[[125 27]
[ 25 123]]
Errors : 52
**************************
**classification_report** WITH K= 29
precision recall f1-score support
0 0.85 0.81 0.83 152
1 0.81 0.86 0.84 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 29
[[123 29]
[ 21 127]]
Errors : 50
**************************
**classification_report** WITH K= 30
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 30
[[124 28]
[ 24 124]]
Errors : 52
**************************
**classification_report** WITH K= 31
precision recall f1-score support
0 0.87 0.81 0.84 152
1 0.82 0.87 0.84 148
accuracy 0.84 300
macro avg 0.84 0.84 0.84 300
weighted avg 0.84 0.84 0.84 300
**confusion_matrix** WITH K= 31
[[123 29]
[ 19 129]]
Errors : 48
**************************
**classification_report** WITH K= 32
precision recall f1-score support
0 0.85 0.82 0.83 152
1 0.82 0.85 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 32
[[124 28]
[ 22 126]]
Errors : 50
**************************
**classification_report** WITH K= 33
precision recall f1-score support
0 0.85 0.80 0.82 152
1 0.81 0.85 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 33
[[122 30]
[ 22 126]]
Errors : 52
**************************
**classification_report** WITH K= 34
precision recall f1-score support
0 0.84 0.81 0.83 152
1 0.81 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 34
[[123 29]
[ 23 125]]
Errors : 52
**************************
**classification_report** WITH K= 35
precision recall f1-score support
0 0.85 0.81 0.83 152
1 0.81 0.85 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 35
[[123 29]
[ 22 126]]
Errors : 51
**************************
**classification_report** WITH K= 36
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 36
[[125 27]
[ 23 125]]
Errors : 50
**************************
**classification_report** WITH K= 37
precision recall f1-score support
0 0.86 0.82 0.84 152
1 0.82 0.86 0.84 148
accuracy 0.84 300
macro avg 0.84 0.84 0.84 300
weighted avg 0.84 0.84 0.84 300
**confusion_matrix** WITH K= 37
[[125 27]
[ 21 127]]
Errors : 48
**************************
**classification_report** WITH K= 38
precision recall f1-score support
0 0.85 0.83 0.84 152
1 0.83 0.84 0.84 148
accuracy 0.84 300
macro avg 0.84 0.84 0.84 300
weighted avg 0.84 0.84 0.84 300
**confusion_matrix** WITH K= 38
[[126 26]
[ 23 125]]
Errors : 49
**************************
**classification_report** WITH K= 39
precision recall f1-score support
0 0.86 0.82 0.84 152
1 0.82 0.86 0.84 148
accuracy 0.84 300
macro avg 0.84 0.84 0.84 300
weighted avg 0.84 0.84 0.84 300
**confusion_matrix** WITH K= 39
[[125 27]
[ 21 127]]
Errors : 48
**************************
**classification_report** WITH K= 40
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 40
[[125 27]
[ 24 124]]
Errors : 51
**************************
**classification_report** WITH K= 41
precision recall f1-score support
0 0.85 0.81 0.83 152
1 0.81 0.86 0.84 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 41
[[123 29]
[ 21 127]]
Errors : 50
**************************
**classification_report** WITH K= 42
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 42
[[125 27]
[ 23 125]]
Errors : 50
**************************
**classification_report** WITH K= 43
precision recall f1-score support
0 0.86 0.82 0.84 152
1 0.82 0.86 0.84 148
accuracy 0.84 300
macro avg 0.84 0.84 0.84 300
weighted avg 0.84 0.84 0.84 300
**confusion_matrix** WITH K= 43
[[124 28]
[ 21 127]]
Errors : 49
**************************
**classification_report** WITH K= 44
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 44
[[124 28]
[ 24 124]]
Errors : 52
**************************
**classification_report** WITH K= 45
precision recall f1-score support
0 0.85 0.82 0.83 152
1 0.82 0.85 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 45
[[124 28]
[ 22 126]]
Errors : 50
**************************
**classification_report** WITH K= 46
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 46
[[125 27]
[ 23 125]]
Errors : 50
**************************
**classification_report** WITH K= 47
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 47
[[124 28]
[ 23 125]]
Errors : 51
**************************
**classification_report** WITH K= 48
precision recall f1-score support
0 0.84 0.83 0.83 152
1 0.83 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 48
[[126 26]
[ 24 124]]
Errors : 50
**************************
**classification_report** WITH K= 49
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 49
[[124 28]
[ 24 124]]
Errors : 52
**************************
**classification_report** WITH K= 50
precision recall f1-score support
0 0.84 0.83 0.83 152
1 0.83 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 50
[[126 26]
[ 24 124]]
Errors : 50
**************************
**classification_report** WITH K= 51
precision recall f1-score support
0 0.85 0.83 0.84 152
1 0.83 0.84 0.84 148
accuracy 0.84 300
macro avg 0.84 0.84 0.84 300
weighted avg 0.84 0.84 0.84 300
**confusion_matrix** WITH K= 51
[[126 26]
[ 23 125]]
Errors : 49
**************************
**classification_report** WITH K= 52
precision recall f1-score support
0 0.84 0.84 0.84 152
1 0.83 0.84 0.84 148
accuracy 0.84 300
macro avg 0.84 0.84 0.84 300
weighted avg 0.84 0.84 0.84 300
**confusion_matrix** WITH K= 52
[[127 25]
[ 24 124]]
Errors : 49
**************************
**classification_report** WITH K= 53
precision recall f1-score support
0 0.85 0.83 0.84 152
1 0.83 0.85 0.84 148
accuracy 0.84 300
macro avg 0.84 0.84 0.84 300
weighted avg 0.84 0.84 0.84 300
**confusion_matrix** WITH K= 53
[[126 26]
[ 22 126]]
Errors : 48
**************************
**classification_report** WITH K= 54
precision recall f1-score support
0 0.84 0.84 0.84 152
1 0.84 0.84 0.84 148
accuracy 0.84 300
macro avg 0.84 0.84 0.84 300
weighted avg 0.84 0.84 0.84 300
**confusion_matrix** WITH K= 54
[[128 24]
[ 24 124]]
Errors : 48
**************************
**classification_report** WITH K= 55
precision recall f1-score support
0 0.85 0.82 0.84 152
1 0.82 0.85 0.84 148
accuracy 0.84 300
macro avg 0.84 0.84 0.84 300
weighted avg 0.84 0.84 0.84 300
**confusion_matrix** WITH K= 55
[[125 27]
[ 22 126]]
Errors : 49
**************************
**classification_report** WITH K= 56
precision recall f1-score support
0 0.83 0.82 0.83 152
1 0.82 0.83 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 56
[[125 27]
[ 25 123]]
Errors : 52
**************************
**classification_report** WITH K= 57
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 57
[[125 27]
[ 23 125]]
Errors : 50
**************************
**classification_report** WITH K= 58
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 58
[[125 27]
[ 24 124]]
Errors : 51
**************************
**classification_report** WITH K= 59
precision recall f1-score support
0 0.85 0.81 0.83 152
1 0.81 0.85 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 59
[[123 29]
[ 22 126]]
Errors : 51
Text(0.5, 1.0, 'Error rate vs K value')
######## Decision Trees and Random Forests ########
# Load the kyphosis dataset; 'Kyphosis' is the target column
# (its classes, per the report output below, are: absent / present)
df = pd.read_csv('datas/kyphosis.csv')
#####################################
from sklearn.model_selection import train_test_split
# Features = every column except the target
X = df.drop('Kyphosis',axis=1)
# Target = the 'Kyphosis' column
y = df['Kyphosis']
# Hold out 30% of the rows for testing
# (no random_state, so the split differs on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
from sklearn.tree import DecisionTreeClassifier
# Single decision tree with default hyperparameters
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)
predictions = dtree.predict(X_test)
from sklearn.metrics import classification_report,confusion_matrix
# Per-class precision/recall/f1 plus the raw confusion matrix
print("**classification_report**",)
print(classification_report(y_test,predictions))
print("**confusion_matrix**")
print(confusion_matrix(y_test,predictions))
**classification_report**
precision recall f1-score support
absent 0.88 0.75 0.81 20
present 0.38 0.60 0.46 5
accuracy 0.72 25
macro avg 0.63 0.68 0.64 25
weighted avg 0.78 0.72 0.74 25
**confusion_matrix**
[[15 5]
[ 2 3]]
#####################################
# Random forest: an ensemble of 100 trees, fit on the same train/test split
# produced by the decision-tree cell above.
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=100)
forest.fit(X_train, y_train)
forest_preds = forest.predict(X_test)
# Same evaluation as for the single tree, for a side-by-side comparison.
print("**classification_report**",)
print(classification_report(y_test, forest_preds))
print("**confusion_matrix**")
print(confusion_matrix(y_test, forest_preds))
**classification_report**
precision recall f1-score support
absent 0.86 0.95 0.90 20
present 0.67 0.40 0.50 5
accuracy 0.84 25
macro avg 0.77 0.68 0.70 25
weighted avg 0.82 0.84 0.82 25
**confusion_matrix**
[[19 1]
[ 3 2]]
######## Support Vector Machines (SVM) ########
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix
# Breast-cancer dataset: 30 numeric features, binary target (column 'Cancer')
cancer = load_breast_cancer()
features = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])
target = pd.DataFrame(cancer['target'], columns=['Cancer'])
# 70/30 split, seeded for reproducibility; the one-column target frame is
# flattened to a 1-D array as train_test_split / SVC expect
X_train, X_test, y_train, y_test = train_test_split(
    features, np.ravel(target), test_size=0.30, random_state=101)
# SVM classifier with default hyperparameters
svc_model = SVC()
svc_model.fit(X_train, y_train)
svc_preds = svc_model.predict(X_test)
print("**classification_report**",)
print(classification_report(y_test, svc_preds))
print("**confusion_matrix**")
print(confusion_matrix(y_test, svc_preds))
**classification_report**
precision recall f1-score support
0 0.95 0.85 0.90 66
1 0.91 0.97 0.94 105
accuracy 0.92 171
macro avg 0.93 0.91 0.92 171
weighted avg 0.93 0.92 0.92 171
**confusion_matrix**
[[ 56 10]
[ 3 102]]
Finding the right parameters (like what C or gamma values to use) is a tricky task! But luckily, we can be a little lazy and just try a bunch of combinations and see what works best! This idea of creating a 'grid' of parameters and just trying out all the possible combinations is called a Gridsearch; this method is common enough that Scikit-learn has this functionality built in with GridSearchCV! The CV stands for cross-validation, which is the process of splitting the training data into folds and scoring each parameter combination on the held-out fold.
GridSearchCV takes a dictionary that describes the parameters that should be tried and a model to train. The grid of parameters is defined as a dictionary, where the keys are the parameters and the values are the settings to be tested.
One of the great things about GridSearchCV is that it is a meta-estimator. It takes an estimator like SVC, and creates a new estimator, that behaves exactly the same - in this case, like a classifier. You should add refit=True and choose verbose to whatever number you want: the higher the number, the more verbose (verbose just means the text output describing the process).
What fit does is a bit more involved than usual. First, it runs the same loop with cross-validation, to find the best parameter combination. Once it has the best combination, it runs fit again on all data passed to fit (without cross-validation), to build a single new model using the best parameter setting.
# Hyperparameter candidates: 5 values of C x 5 values of gamma, RBF kernel only
# (25 combinations, each cross-validated over 5 folds -> 125 fits).
search_space = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}
from sklearn.model_selection import GridSearchCV
# refit=True retrains the best combination on the full training set;
# verbose=3 prints one score line per CV fold.
grid = GridSearchCV(SVC(), search_space, refit=True, verbose=3)
grid.fit(X_train, y_train)  # may take a while!
Fitting 5 folds for each of 25 candidates, totalling 125 fits [CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.625 total time= 0.0s [CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.625 total time= 0.0s [CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.625 total time= 0.0s [CV 4/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.625 total time= 0.0s [CV 4/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.887 total time= 0.0s [CV 2/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.938 total time= 0.0s [CV 3/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.963 total time= 0.0s [CV 4/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.962 total time= 0.0s [CV 5/5] END ...C=0.1, gamma=0.0001, 
kernel=rbf;, score=0.886 total time= 0.0s [CV 1/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.625 total time= 0.0s [CV 4/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.625 total time= 0.0s [CV 4/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.625 total time= 0.0s [CV 4/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.900 total time= 0.0s [CV 2/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.912 total time= 0.0s [CV 3/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.925 total time= 0.0s [CV 4/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.962 total time= 0.0s [CV 5/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.937 total time= 0.0s [CV 1/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.912 total time= 0.0s [CV 2/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.950 total time= 0.0s [CV 3/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.975 total time= 0.0s [CV 4/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.962 total time= 0.0s [CV 5/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.937 
total time= 0.0s [CV 1/5] END .........C=10, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END .........C=10, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END .........C=10, gamma=1, kernel=rbf;, score=0.625 total time= 0.0s [CV 4/5] END .........C=10, gamma=1, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END .........C=10, gamma=1, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.625 total time= 0.0s [CV 4/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.613 total time= 0.0s [CV 4/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.887 total time= 0.0s [CV 2/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.912 total time= 0.0s [CV 3/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.900 total time= 0.0s [CV 4/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.937 total time= 0.0s [CV 5/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.924 total time= 0.0s [CV 1/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.950 total time= 0.0s [CV 2/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.912 total time= 0.0s [CV 3/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.975 total time= 0.0s [CV 4/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.949 total time= 0.0s [CV 5/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.949 total time= 0.0s [CV 
1/5] END ........C=100, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END ........C=100, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END ........C=100, gamma=1, kernel=rbf;, score=0.625 total time= 0.0s [CV 4/5] END ........C=100, gamma=1, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END ........C=100, gamma=1, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.625 total time= 0.0s [CV 4/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.613 total time= 0.0s [CV 4/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.887 total time= 0.0s [CV 2/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.912 total time= 0.0s [CV 3/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.900 total time= 0.0s [CV 4/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.937 total time= 0.0s [CV 5/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.924 total time= 0.0s [CV 1/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=0.925 total time= 0.0s [CV 2/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=0.912 total time= 0.0s [CV 3/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=0.975 total time= 0.0s [CV 4/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=0.937 total time= 0.0s [CV 5/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=0.949 total time= 0.0s [CV 1/5] END .......C=1000, 
gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.625 total time= 0.0s [CV 4/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.625 total time= 0.0s [CV 4/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.613 total time= 0.0s [CV 4/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.887 total time= 0.0s [CV 2/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.912 total time= 0.0s [CV 3/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.900 total time= 0.0s [CV 4/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.937 total time= 0.0s [CV 5/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.924 total time= 0.0s [CV 1/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.938 total time= 0.0s [CV 2/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.912 total time= 0.0s [CV 3/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.963 total time= 0.0s [CV 4/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.924 total time= 0.0s [CV 5/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.962 total time= 0.0s
GridSearchCV(estimator=SVC(),
param_grid={'C': [0.1, 1, 10, 100, 1000],
'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
'kernel': ['rbf']},
verbose=3)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(estimator=SVC(),
param_grid={'C': [0.1, 1, 10, 100, 1000],
'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
'kernel': ['rbf']},
verbose=3)SVC()
SVC()
# You can inspect the best parameters found by GridSearchCV in the best_params_ attribute,
#and the best estimator in the best_estimator_ attribute:
print(grid.best_estimator_)
print('\n')
print(grid.best_params_)
print('\n')
print(grid.best_score_)  # mean cross-validated score of the best combination
SVC(C=1, gamma=0.0001)
{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.9472468354430379
# Evaluate the refit best estimator on the held-out test set.
tuned_preds = grid.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
print("**classification_report**",)
print(classification_report(y_test, tuned_preds))
print("**confusion_matrix**")
print(confusion_matrix(y_test, tuned_preds))
**classification_report**
precision recall f1-score support
0 0.94 0.89 0.91 66
1 0.94 0.96 0.95 105
accuracy 0.94 171
macro avg 0.94 0.93 0.93 171
weighted avg 0.94 0.94 0.94 171
**confusion_matrix**
[[ 59 7]
[ 4 101]]
K Means Clustering is an unsupervised learning algorithm that tries to cluster data based on their similarity. Unsupervised learning means that there is no outcome to be predicted, and the algorithm just tries to find patterns in the data. In k means clustering, we have to specify the number of clusters we want the data to be grouped into. The algorithm randomly assigns each observation to a cluster, and finds the centroid of each cluster. Then, the algorithm iterates through two steps: Reassign data points to the cluster whose centroid is closest. Calculate the new centroid of each cluster. These two steps are repeated until the within-cluster variation cannot be reduced any further. The within-cluster variation is calculated as the sum of the Euclidean distances between the data points and their respective cluster centroids.
from sklearn.datasets import make_blobs
# Synthetic clustering data: 200 samples x 2 features drawn around 4 centers.
# make_blobs returns (samples, true_center_labels); `data` is reused by the
# K-Means cell below.
data = make_blobs(n_samples=200, n_features=2, centers=4,
                  cluster_std=1.8, random_state=101)
points, blob_labels = data[0], data[1]
# Scatter the two features, coloured by the true blob each point came from.
plt.scatter(points[:, 0], points[:, 1], c=blob_labels, cmap='rainbow')
plt.xlabel('data[0] 1st column')
plt.ylabel('data[0] 2nd column')
plt.title('200 datas x 2 features of 4 centers generated')
Text(0.5, 1.0, '200 datas x 2 features of 4 centers generated')
from sklearn.cluster import KMeans
# Fit K-Means asking for the known number of blobs (4). The cluster ids it
# assigns are arbitrary, so colours may not match the "Original" panel.
kmeans = KMeans(n_clusters=4, n_init=10)
kmeans.fit(data[0])
# Side-by-side comparison: clusters found by K-Means vs. the true blob labels.
fig, (left_ax, right_ax) = plt.subplots(1, 2, sharey=True, figsize=(10, 6))
left_ax.set_title('K Means')
left_ax.scatter(data[0][:, 0], data[0][:, 1],
                c=kmeans.labels_, cmap='rainbow')
right_ax.set_title("Original")
right_ax.scatter(data[0][:, 0], data[0][:, 1], c=data[1], cmap='rainbow')
<matplotlib.collections.PathCollection at 0x226cf3674f0>
kmeans.cluster_centers_ #centers found
array([[-4.13591321, 7.95389851],
[-9.46941837, -6.56081545],
[-0.0123077 , 2.13407664],
[ 3.71749226, 7.01388735]])
As we've noticed before it is difficult to visualize high dimensional data, we can use PCA to find the first two principal components, and visualize the data in this new, two-dimensional space, with a single scatter-plot. Before we do this though, we'll need to scale our data so that each feature has a single unit variance.
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# `cancer` and `pca` are reused by the loadings cell below — keep these names.
cancer = load_breast_cancer()
feat_df = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])
# Standardize every feature (zero mean, unit variance) so that no feature
# dominates the PCA purely because of its scale.
std_scaler = StandardScaler()
standardized = std_scaler.fit_transform(feat_df)   # shape (569, 30)
# Project the 30 standardized features onto the first 2 principal components.
pca = PCA(n_components=2)
pca.fit(standardized)
projected = pca.transform(standardized)            # shape (569, 2)
# Colour each projected sample by its target class to see the separation.
plt.figure(figsize=(8,6))
plt.scatter(projected[:, 0], projected[:, 1],
            c=cancer['target'], cmap='plasma')
plt.xlabel('First principal component')
plt.ylabel('Second Principal Component')
Text(0, 0.5, 'Second Principal Component')
Clearly by using these two components we can easily separate these two classes.
Unfortunately, with this great power of dimensionality reduction comes a cost: it becomes harder to understand what these components represent.
The components correspond to combinations of the original features, the components themselves are stored as an attribute of the fitted PCA object:
# Loadings matrix: one row per principal component, one column per original
# feature; each cell is that feature's weight in the component.
df_comp = pd.DataFrame(pca.components_,columns=cancer['feature_names'])
df_comp
| mean radius | mean texture | mean perimeter | mean area | mean smoothness | mean compactness | mean concavity | mean concave points | mean symmetry | mean fractal dimension | ... | worst radius | worst texture | worst perimeter | worst area | worst smoothness | worst compactness | worst concavity | worst concave points | worst symmetry | worst fractal dimension | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.218902 | 0.103725 | 0.227537 | 0.220995 | 0.142590 | 0.239285 | 0.258400 | 0.260854 | 0.138167 | 0.064363 | ... | 0.227997 | 0.104469 | 0.236640 | 0.224871 | 0.127953 | 0.210096 | 0.228768 | 0.250886 | 0.122905 | 0.131784 |
| 1 | -0.233857 | -0.059706 | -0.215181 | -0.231077 | 0.186113 | 0.151892 | 0.060165 | -0.034768 | 0.190349 | 0.366575 | ... | -0.219866 | -0.045467 | -0.199878 | -0.219352 | 0.172304 | 0.143593 | 0.097964 | -0.008257 | 0.141883 | 0.275339 |
2 rows × 30 columns
# Heatmap of the loadings in df_comp: bright cells mark the original
# features that contribute most to each of the two principal components.
plt.figure(figsize=(12,6))
sns.heatmap(df_comp,cmap='plasma')
<AxesSubplot: >
#####################################
#####################################
# Check predictions results
#####################################
#####################################
# Image in code cell
# Render the confusion-matrix illustration shipped next to the notebook.
Image(filename='imgs/confusionMatrix.JPG')
Accuracy = (TP + TN) / total = 150 / 165 = 0.91
Accuracy is useful when target classes are well balanced, BUT not a good choice with **unbalanced** classes!
Misclassification (error) rate = (FP + FN) / total = 15 / 165 = 0.09
Recall = TP / (TP + FN) = 100 / 105 = 0.95
Precision = TP / (TP + FP) = 100 / 110 = 0.91
While recall expresses the ability to find all relevant instances in a dataset,
precision expresses the proportion of the data points our model identified as relevant that actually were relevant.
Image('imgs/confusionMatrixFormulas.png')
Here are three common evaluation metrics for regression problems:
Mean Absolute Error (MAE) is the mean of the absolute value of the errors:
$$\frac 1n\sum_{i=1}^n|y_i-\hat{y}_i|$$ (sum of the absolute differences between predicted and actual values, divided by the number of predictions
==> large errors are not punished more than small ones)
Mean Squared Error (MSE) is the mean of the squared errors:
$$\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2$$ (sum of the squared differences between predicted and actual values, divided by the number of predictions
==> but the units become squared units)
Root Mean Squared Error (RMSE) is the square root of the mean of the squared errors:
$$\sqrt{\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2}$$ (square root of the mean of the squared differences between predicted and actual values — interpretable in the target's original units)
Comparing these metrics:
All of these are loss functions, because we want to minimize them.
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision / recall / f1 / support, followed by the raw
# confusion matrix for the same predictions.
print("**classification_report**")
print(classification_report(y_test, predictions))
print("**confusion_matrix**")
print(confusion_matrix(y_test, predictions))
**classification_report**
precision recall f1-score support
0 0.83 0.90 0.86 163
1 0.82 0.71 0.76 104
accuracy 0.83 267
macro avg 0.83 0.81 0.81 267
weighted avg 0.83 0.83 0.83 267
**confusion_matrix**
[[147 16]
[ 30 74]]
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
%matplotlib inline
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('datas/u.data', sep='\t', names=column_names)
movie_titles = pd.read_csv('datas/Movie_Id_Titles')
df = pd.merge(df,movie_titles,on='item_id')
df.head()
| user_id | item_id | rating | timestamp | title | |
|---|---|---|---|---|---|
| 0 | 0 | 50 | 5 | 881250949 | Star Wars (1977) |
| 1 | 290 | 50 | 5 | 880473582 | Star Wars (1977) |
| 2 | 79 | 50 | 4 | 891271545 | Star Wars (1977) |
| 3 | 2 | 50 | 5 | 888552084 | Star Wars (1977) |
| 4 | 8 | 50 | 5 | 879362124 | Star Wars (1977) |
# Per-movie statistics: mean rating plus how many ratings each movie got.
ratings = pd.DataFrame(df.groupby('title')['rating'].mean())
# Assign the count Series directly — it aligns on the same 'title' index;
# wrapping it in an extra pd.DataFrame (as before) is unnecessary.
ratings['num of ratings'] = df.groupby('title')['rating'].count()
# Most-rated movies first.
ratings.sort_values(by='num of ratings', ascending=False).head(10)
| rating | num of ratings | |
|---|---|---|
| title | ||
| Star Wars (1977) | 4.359589 | 584 |
| Contact (1997) | 3.803536 | 509 |
| Fargo (1996) | 4.155512 | 508 |
| Return of the Jedi (1983) | 4.007890 | 507 |
| Liar Liar (1997) | 3.156701 | 485 |
| English Patient, The (1996) | 3.656965 | 481 |
| Scream (1996) | 3.441423 | 478 |
| Toy Story (1995) | 3.878319 | 452 |
| Air Force One (1997) | 3.631090 | 431 |
| Independence Day (ID4) (1996) | 3.438228 | 429 |
# User-by-movie rating matrix: one row per user_id, one column per movie
# title, cell = that user's rating (NaN where the user gave no rating).
moviemat = df.pivot_table(index='user_id',
columns='title',values='rating')
moviemat.head()
| title | 'Til There Was You (1997) | 1-900 (1994) | 101 Dalmatians (1996) | 12 Angry Men (1957) | 187 (1997) | 2 Days in the Valley (1996) | 20,000 Leagues Under the Sea (1954) | 2001: A Space Odyssey (1968) | 3 Ninjas: High Noon At Mega Mountain (1998) | 39 Steps, The (1935) | ... | Yankee Zulu (1994) | Year of the Horse (1997) | You So Crazy (1994) | Young Frankenstein (1974) | Young Guns (1988) | Young Guns II (1990) | Young Poisoner's Handbook, The (1995) | Zeus and Roxanne (1997) | unknown | Á köldum klaka (Cold Fever) (1994) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| user_id | |||||||||||||||||||||
| 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | NaN | NaN | 2.0 | 5.0 | NaN | NaN | 3.0 | 4.0 | NaN | NaN | ... | NaN | NaN | NaN | 5.0 | 3.0 | NaN | NaN | NaN | 4.0 | NaN |
| 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | NaN | NaN | NaN | NaN | 2.0 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 1664 columns
# Per-user rating vectors (indexed by user_id) for the two movies we will
# correlate against; NaN marks users who did not rate the movie.
starwars_user_ratings = moviemat['Star Wars (1977)']
liarliar_user_ratings = moviemat['Liar Liar (1997)']
starwars_user_ratings.head(20)
user_id 0 5.0 1 5.0 2 5.0 3 NaN 4 5.0 5 4.0 6 4.0 7 5.0 8 5.0 9 5.0 10 5.0 11 NaN 12 4.0 13 5.0 14 5.0 15 5.0 16 NaN 17 NaN 18 4.0 19 NaN Name: Star Wars (1977), dtype: float64
##################################################
#RECOMMENDER SYSTEM BASED ON CORRELATION CORRWITH
##################################################
# Pearson-correlate every movie column against the target movie's rating
# vector. Movies with too few overlapping raters yield NaN and trigger
# the RuntimeWarnings seen in the output below.
similar_to_starwars = moviemat.corrwith(starwars_user_ratings)
similar_to_liarliar = moviemat.corrwith(liarliar_user_ratings)
similar_to_starwars
C:\Python310\lib\site-packages\numpy\lib\function_base.py:2845: RuntimeWarning: Degrees of freedom <= 0 for slice c = cov(x, y, rowvar, dtype=dtype) C:\Python310\lib\site-packages\numpy\lib\function_base.py:2704: RuntimeWarning: divide by zero encountered in divide c *= np.true_divide(1, fact) C:\Python310\lib\site-packages\numpy\lib\function_base.py:2845: RuntimeWarning: Degrees of freedom <= 0 for slice c = cov(x, y, rowvar, dtype=dtype) C:\Python310\lib\site-packages\numpy\lib\function_base.py:2704: RuntimeWarning: divide by zero encountered in divide c *= np.true_divide(1, fact)
title
'Til There Was You (1997) 0.872872
1-900 (1994) -0.645497
101 Dalmatians (1996) 0.211132
12 Angry Men (1957) 0.184289
187 (1997) 0.027398
...
Young Guns II (1990) 0.228615
Young Poisoner's Handbook, The (1995) -0.007374
Zeus and Roxanne (1997) 0.818182
unknown 0.723123
Á köldum klaka (Cold Fever) (1994) NaN
Length: 1664, dtype: float64
# Wrap the correlation Series in a DataFrame and drop the movies whose
# correlation is undefined (NaN — no overlapping raters).
corr_starwars = pd.DataFrame(similar_to_starwars, columns=['Correlation'])
corr_starwars = corr_starwars.dropna()
corr_starwars.head()
| Correlation | |
|---|---|
| title | |
| 'Til There Was You (1997) | 0.872872 |
| 1-900 (1994) | -0.645497 |
| 101 Dalmatians (1996) | 0.211132 |
| 12 Angry Men (1957) | 0.184289 |
| 187 (1997) | 0.027398 |
# Attach each movie's rating count via an index-aligned join on title.
corr_starwars = corr_starwars.join(ratings['num of ratings'])
# Sorting shows some suspicious perfect correlations: they come from
# movies rated only once or twice, by users who also happened to watch
# Star Wars (the most popular movie in the dataset).
corr_starwars.sort_values('Correlation', ascending=False)
| Correlation | num of ratings | |
|---|---|---|
| title | ||
| Hollow Reed (1996) | 1.0 | 6 |
| Commandments (1997) | 1.0 | 3 |
| Cosi (1996) | 1.0 | 4 |
| No Escape (1994) | 1.0 | 5 |
| Stripes (1981) | 1.0 | 5 |
| ... | ... | ... |
| For Ever Mozart (1996) | -1.0 | 3 |
| Frankie Starlight (1995) | -1.0 | 4 |
| I Like It Like That (1994) | -1.0 | 3 |
| American Dream (1990) | -1.0 | 2 |
| Theodore Rex (1995) | -1.0 | 5 |
1410 rows × 2 columns
#most rated movies
# Histogram of how many ratings each movie received — most titles
# have only a handful of ratings.
plt.figure(figsize=(10,4))
ratings['num of ratings'].hist(bins=70)
plt.xlabel('Num of ratings')
Text(0.5, 0, 'Num of ratings')
# Keep only movies with more than 100 ratings before ranking by
# correlation — this removes the spurious +/-1.0 entries.
well_rated = corr_starwars[corr_starwars['num of ratings'] > 100]
well_rated.sort_values('Correlation', ascending=False).head()
| Correlation | num of ratings | |
|---|---|---|
| title | ||
| Star Wars (1977) | 1.000000 | 584 |
| Empire Strikes Back, The (1980) | 0.748353 | 368 |
| Return of the Jedi (1983) | 0.672556 | 507 |
| Raiders of the Lost Ark (1981) | 0.536117 | 420 |
| Austin Powers: International Man of Mystery (1997) | 0.377433 | 130 |